# Clear memory
rm(list = ls())

# Load packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  tidyverse, tidylog, data.table, janitor,
  magrittr, lubridate,
  fixest, broom, haven, arrow
)
options("tidylog.display" = NULL)
select = dplyr::select

migration = read_parquet("data/IRS/migration_shares_full.pq") %>% 
  filter(year < 2002)

# wages and nonattainment
df_o = fread("data/data_full_panel_time_series.csv") %>% 
  select(origin = fips, year, starts_with("nonattainment"), ma, pay, emp, -nonattainment) %>% 
  drop_na(origin) %>% 
  rowwise() %>%
  mutate(origin_nonattainment = max(
    nonattainment_o3, nonattainment_co, nonattainment_pm10, nonattainment_so2
  )) %>% 
  group_by(origin, year) %>% 
  mutate(
    origin_wage = sum(pay)/sum(emp), 
    origin_ma = first(ma)
  ) %>% 
  rename_with(~str_c("origin_", .), .cols = starts_with("nonattainment")) %>% 
  distinct(origin, year, .keep_all = TRUE) %>% 
    filter(year < 2002)

# compute first year of nonattainment for counties that go into nonattainment
first_o = df_o %>%
  distinct(origin, year, origin_nonattainment) %>% 
  filter(origin_nonattainment == 1) %>% 
  arrange(origin, origin_nonattainment, year) %>%
  group_by(origin) %>%
  mutate(first_nonattain_orig = first(year)) %>%
  mutate(first_nonattain_orig = ifelse(first_nonattain_orig == 1990, 1991, first_nonattain_orig)) %>% 
  filter(origin_nonattainment == 1) %>% 
  select(origin, first_nonattain_orig) %>%
  distinct()

# wages and nonattainment
df_d = fread("data/data_full_panel_time_series.csv") %>% 
  select(destination = fips, year, starts_with("nonattainment"), ma, pay, emp, -nonattainment) %>% 
  drop_na(destination) %>% 
  rowwise() %>%
  mutate(dest_nonattainment = max(
    nonattainment_o3, nonattainment_co, nonattainment_pm10, nonattainment_so2
  )) %>% 
  group_by(destination, year) %>% 
  mutate(
    dest_wage = sum(pay)/sum(emp), 
    dest_ma = first(ma)
  ) %>% 
  rename_with(~str_c("dest_", .), .cols = starts_with("nonattainment")) %>% 
  distinct(destination, year, .keep_all = TRUE) %>% 
  filter(year < 2002)

# compute first year of nonattainment for counties that go into nonattainment
first_d = df_d %>%
  distinct(destination, year, dest_nonattainment) %>% 
  filter(dest_nonattainment == 1) %>% 
  arrange(destination, dest_nonattainment, year) %>%
  group_by(destination) %>%
  mutate(first_nonattain_dest = first(year)) %>%
  mutate(first_nonattain_dest = ifelse(first_nonattain_dest == 1990, 1991, first_nonattain_dest)) %>% 
  filter(dest_nonattainment == 1) %>% 
  select(destination, first_nonattain_dest) %>%
  distinct() 

df_o = df_o %>% as.data.table()
df_d = df_d %>% as.data.table()
first_o = first_o %>% as.data.table()
first_d = first_d %>% as.data.table()

full_panel = merge(migration, df_o, by = c("origin", "year"))
rm(migration)
full_panel = merge(full_panel, df_d, by = c("destination", "year"))
full_panel = merge(full_panel, first_o, by = c("origin"), all.x = TRUE)
full_panel = merge(full_panel, first_d, by = c("destination"), all.x = TRUE)
full_panel = full_panel %>% as.data.table()

# fill in zeros for first nonattainment year for never nonattainment counties
full_panel$first_nonattain_dest[is.na(full_panel$first_nonattain_dest)] = 0
full_panel$first_nonattain_orig[is.na(full_panel$first_nonattain_orig)] = 0

# county is treated if its first nonattainment is after 1990 CAAAs, and its at least as late as its first nonattainment year
full_panel[, `:=`(treated_dest = first_nonattain_dest > 1990 & year > 1990 & year >= first_nonattain_dest,
                  treated_orig = first_nonattain_orig > 1990 & year > 1990 & year >= first_nonattain_orig)]


# keep only observations valid in PPML to reduce final dataset size
model = fepois((share/initial_own_share) ~
                 I(treated_dest - treated_orig) |
                 origin^destination + year, 
               data = full_panel,
               offset = ~ log(((dest_wage/origin_wage)/(dest_ma/origin_ma)^(-1/4)))
)

full_panel = full_panel[model$obs_selection$obsRemoved,]
gc(verbose = TRUE)

final_panel = full_panel %>%
  ungroup() %>%
  mutate(first_non_1990 = ifelse(first_nonattain_dest >= 1991, first_nonattain_dest, NA),
         event_time_dest = year - first_non_1990,
         event_time_dest = ifelse(!is.na(first_non_1990), event_time_dest, -1),
         ones = 1) %>%
  group_by(event_time_dest) %>% 
  mutate(event_obs = n()) %>% 
  ungroup() %>% 
  mutate(balanced_time = event_obs == sort(unique(event_obs), decreasing = TRUE)[2]) %>% 
  group_by(balanced_time) %>% 
  mutate(upper_cap = max(event_time_dest),
         upper_cap = ifelse(upper_cap > 5, 5, upper_cap),
         lower_cap = min(event_time_dest), 
         lower_cap = ifelse(lower_cap < -5, -5, lower_cap)) %>% 
  mutate(upper_cap = 4,
         lower_cap = -1) %>%
  ungroup() %>% 
  mutate(upper_cap = min(upper_cap),
         lower_cap = max(lower_cap)) %>% 
  mutate(
    event_time_dest = case_when(
      event_time_dest > upper_cap ~ upper_cap + 1,
      event_time_dest < lower_cap ~ lower_cap - 1,
      TRUE ~ event_time_dest
    )
  )

final_panel = final_panel %>%
  ungroup() %>%
  mutate(first_non_1990 = ifelse(first_nonattain_orig >= 1991, first_nonattain_orig, NA),
         event_time_orig = year - first_non_1990,
         event_time_orig = ifelse(!is.na(first_non_1990), event_time_orig, -1),
         ones = 1) %>%
  mutate(
    event_time_orig = case_when(
      event_time_orig > 5 ~ 5,
      event_time_orig < -5 ~ -5,
      TRUE ~ event_time_orig
    )
  ) %>%
  group_by(event_time_orig) %>% 
  mutate(event_obs = n()) %>% 
  ungroup() %>% 
  mutate(balanced_time = event_obs == sort(unique(event_obs), decreasing = TRUE)[2]) %>% 
  group_by(balanced_time) %>% 
  mutate(upper_cap = max(event_time_orig),
         upper_cap = ifelse(upper_cap > 5, 5, upper_cap),
         lower_cap = min(event_time_orig), 
         lower_cap = ifelse(lower_cap < -5, -5, lower_cap)) %>% 
  mutate(upper_cap = 4,
         lower_cap = -1) %>%
  ungroup() %>% 
  mutate(upper_cap = min(upper_cap),
         lower_cap = max(lower_cap)) %>% 
  mutate(
    event_time_orig = case_when(
      event_time_orig > upper_cap ~ upper_cap + 1,
      event_time_orig < lower_cap ~ lower_cap - 1,
      TRUE ~ event_time_orig
    )
  )

final_panel = final_panel %>% 
  mutate(treated_orig = as.factor(event_time_orig >= 1),
         treated_dest = as.factor(event_time_dest >= 1))

# save panel that has all the observations that work for PPML with FEs
fwrite(final_panel, "data/amenity_panel.csv")
